library(tidyverse) ; library(reshape2) ; library(glue) ; library(plotly) ; library(plotlyutils)
library(RColorBrewer) ; library(viridis) ; require(gridExtra) ; library(GGally) ; library(ggExtra) ; library(ggpubr)
library(biomaRt) ; library(DESeq2) ; library(sva) ; library(WGCNA) ; library(vsn)
library(dendextend) ; library(expss)
library(knitr) ; library(kableExtra)
Dataset downloaded from mgandal’s github repository.
# Load csvs
datExpr = read.csv('./../Data/inputData/RNAseq_ASD_datExpr.csv', row.names=1)
datMeta = read.csv('./../Data/inputData/RNAseq_ASD_datMeta.csv')
# 1. Group brain regions by lobes
# 2. Remove '/' from Batch variable: (It is recommended (but not required) to use only letters, numbers,
# and delimiters '_' or '.', in levels of factors as these are safe characters for column names in R
# 3. Transform Diagnosis into a factor variable with the first level being the control group
datMeta = datMeta %>% mutate(Brain_Region = Region %>% as.factor) %>%
mutate(Brain_lobe = ifelse(Brain_Region %in% c('BA4_6','BA9','BA24','BA44_45'), 'Frontal',
ifelse(Brain_Region %in% c('BA3_1_2_5', 'BA7'), 'Parietal',
ifelse(Brain_Region %in% c('BA38','BA39_40','BA20_37','BA41_42_22'),
'Temporal',
'Occipital')))) %>%
mutate(Batch = gsub('/', '.', RNAExtractionBatch %>% as.factor),
Diagnosis = factor(Diagnosis_, levels=c('CTL','ASD'))) %>%
dplyr::select(-Diagnosis_)
# GO Neuronal annotations: regex 'neuron' in GO functional annotations and label the genes that make a match as neuronal
GO_annotations = read.csv('./../Data/inputData/genes_GO_annotations.csv')
GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>%
mutate('ID'=as.character(ensembl_gene_id)) %>%
dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>%
mutate('Neuronal'=1)
# SFARI Genes
SFARI_genes = read_csv('./../../SFARI/Data/SFARI_genes_01-03-2020_w_ensembl_IDs.csv')
SFARI_genes = SFARI_genes[!duplicated(SFARI_genes$ID) & !is.na(SFARI_genes$ID),]
# NCBI biotype annotation
NCBI_biotype = read.csv('./../../NCBI/Data/gene_biotype_info.csv') %>%
dplyr::rename('ensembl_gene_id'=Ensembl_gene_identifier, 'gene_biotype'=type_of_gene,
'hgnc_symbol'=Symbol) %>%
mutate(gene_biotype = ifelse(gene_biotype=='protein-coding','protein_coding', gene_biotype))
rm(GO_annotations)
Data description taken from the dataset’s synapse entry: RNAseq data was generated from 88 postmortem cortex brain samples from subjects with ASD (53 samples from 24 subjects) and non-psychiatric controls (35 samples from 17 subjects), across four cortical regions encompassing all major cortical lobes – frontal, temporal, parietal, and occipital. Brain samples were obtained from the Harvard Brain Bank as part of the Autism Tissue Project (ATP).
The dataset includes 63682 genes from 88 samples belonging to 41 different subjects
Counts distribution: More than half of the counts are zero and most of the counts are relatively low, but there are some very high outliers
counts = datExpr %>% melt
count_distr = data.frame('Statistic' = c('Min', '1st Quartile', 'Median', 'Mean', '3rd Quartile', 'Max'),
'Values' = c(min(counts$value), quantile(counts$value, probs = c(.25, .5)) %>% unname,
mean(counts$value), quantile(counts$value, probs = c(.75)) %>% unname,
max(counts$value)))
count_distr %>% kable(digits = 2, format.args = list(scientific = FALSE)) %>% kable_styling(full_width = F)
| Statistic | Values |
|---|---|
| Min | 0.00 |
| 1st Quartile | 0.00 |
| Median | 0.00 |
| Mean | 564.09 |
| 3rd Quartile | 27.00 |
| Max | 27183314.00 |
rm(counts, count_distr)
I was originally running this with the feb2014 version of BioMart because that’s the one that Gandal used (and it finds all of the Ensembl IDs, which other versions don’t), but it has some outdated biotype annotations, to fix this I’ll obtain all the information except the biotype label from BioMart in the same way as it had been done before, and then I’ll add the most current biotype label using information from NCBI’s website and then from BioMart in the following way:
Use BioMart to run a query with the original feb2014 version using the Ensembl IDs as keys to obtain all the information except the biotype labels
Annotate genes with Biotype labels:
2.1 Use the NCBI annotations downloaded from NCBI’s website and processed in NCBI/RMarkdowns/clean_data.html (there is information for only 26K genes, so some genes will remain unlabelled)
2.2 Use the current version (jan2020) to obtain the biotype annotations using the Ensembl ID as keys (some genes don’t return a match)
2.3 For the genes that didn’t return a match, use the current version (jan2020) to obtain the biotype annotations using the gene name as keys (17 genes return multiple labels)
2.4 For the genes that returned multiple labels, use the feb2014 version with the Ensembl IDs as keys
labels_source = data.frame('source' = c('NCBI', 'BioMart2020_byID', 'BioMart2020_byGene', 'BioMart2014'),
'n_matches' = rep(0,4))
########################################################################################
# 1. Query archive version
getinfo = c('ensembl_gene_id','external_gene_id','chromosome_name','start_position',
'end_position','strand')
mart = useMart(biomart = 'ENSEMBL_MART_ENSEMBL', dataset = 'hsapiens_gene_ensembl',
host = 'feb2014.archive.ensembl.org')
datGenes = getBM(attributes = getinfo, filters=c('ensembl_gene_id'), values = rownames(datExpr), mart=mart) %>%
rename(external_gene_id = 'hgnc_symbol')
datGenes$length = datGenes$end_position - datGenes$start_position
cat(paste0('1. ', sum(is.na(datGenes$start_position)), '/', nrow(datGenes),
' Ensembl IDs weren\'t found in the feb2014 version of BioMart'))
## 1. 0/63677 Ensembl IDs weren't found in the feb2014 version of BioMart
########################################################################################
########################################################################################
# 2. Get Biotype Labels
cat('2. Add biotype information')
## 2. Add biotype information
########################################################################################
# 2.1 Add NCBI annotations
datGenes = datGenes %>% left_join(NCBI_biotype, by=c('ensembl_gene_id','hgnc_symbol'))
cat(paste0('2.1 ' , sum(is.na(datGenes$gene_biotype)), '/', nrow(datGenes),
' Ensembl IDs weren\'t found in the NCBI database'))
## 2.1 42904/63677 Ensembl IDs weren't found in the NCBI database
labels_source$n_matches[1] = sum(!is.na(datGenes$gene_biotype))
########################################################################################
# 2.2 Query current BioMart version for gene_biotype using Ensembl ID as key
getinfo = c('ensembl_gene_id','gene_biotype')
mart = useMart(biomart = 'ENSEMBL_MART_ENSEMBL', dataset = 'hsapiens_gene_ensembl',
host = 'jan2020.archive.ensembl.org')
datGenes_biotype = getBM(attributes = getinfo, filters = c('ensembl_gene_id'), mart=mart,
values = datGenes$ensembl_gene_id[is.na(datGenes$gene_biotype)])
cat(paste0('2.2 ' , sum(is.na(datGenes$gene_biotype))-nrow(datGenes_biotype), '/',
sum(is.na(datGenes$gene_biotype)),
' Ensembl IDs weren\'t found in the jan2020 version of BioMart when querying by Ensembl ID'))
## 2.2 9099/42904 Ensembl IDs weren't found in the jan2020 version of BioMart when querying by Ensembl ID
# Add new gene_biotype info to datGenes
datGenes = datGenes %>% left_join(datGenes_biotype, by='ensembl_gene_id') %>%
mutate(gene_biotype = coalesce(as.character(gene_biotype.x), gene_biotype.y)) %>%
dplyr::select(-gene_biotype.x, -gene_biotype.y)
labels_source$n_matches[2] = sum(!is.na(datGenes$gene_biotype)) - labels_source$n_matches[1]
########################################################################################
# 3. Query current BioMart version for gene_biotype using gene symbol as key
missing_genes = unique(datGenes$hgnc_symbol[is.na(datGenes$gene_biotype)])
getinfo = c('hgnc_symbol','gene_biotype')
datGenes_biotype_by_gene = getBM(attributes=getinfo, filters=c('hgnc_symbol'), mart=mart,
values=missing_genes)
cat(paste0('2.3 ', length(missing_genes)-length(unique(datGenes_biotype_by_gene$hgnc_symbol)),'/',
length(missing_genes),
' genes weren\'t found in the current BioMart version when querying by gene name'))
## 2.3 5712/7866 genes weren't found in the current BioMart version when querying by gene name
dups = unique(datGenes_biotype_by_gene$hgnc_symbol[duplicated(datGenes_biotype_by_gene$hgnc_symbol)])
cat(paste0(' ', length(dups), ' genes returned multiple labels (these won\'t be added)'))
## 17 genes returned multiple labels (these won't be added)
# Update information
datGenes_biotype_by_gene = datGenes_biotype_by_gene %>% filter(!hgnc_symbol %in% dups)
datGenes = datGenes %>% left_join(datGenes_biotype_by_gene, by='hgnc_symbol') %>%
mutate(gene_biotype = coalesce(gene_biotype.x, gene_biotype.y)) %>%
dplyr::select(-gene_biotype.x, -gene_biotype.y)
labels_source$n_matches[3] = sum(!is.na(datGenes$gene_biotype)) - sum(labels_source$n_matches)
########################################################################################
# 4. Query feb2014 BioMart version for the missing biotypes
missing_ensembl_ids = unique(datGenes$ensembl_gene_id[is.na(datGenes$gene_biotype)])
getinfo = c('ensembl_gene_id','gene_biotype')
mart = useMart(biomart = 'ENSEMBL_MART_ENSEMBL', dataset = 'hsapiens_gene_ensembl',
host = 'feb2014.archive.ensembl.org')
datGenes_biotype_archive = getBM(attributes = getinfo, filters=c('ensembl_gene_id'),
values = missing_ensembl_ids, mart=mart)
cat(paste0('2.4 ', length(missing_ensembl_ids)-nrow(datGenes_biotype_archive),'/',length(missing_ensembl_ids),
' genes weren\'t found in the feb2014 BioMart version when querying by Ensembl ID'))
## 2.4 0/6648 genes weren't found in the feb2014 BioMart version when querying by Ensembl ID
# Update information
datGenes = datGenes %>% left_join(datGenes_biotype_archive, by='ensembl_gene_id') %>%
mutate(gene_biotype = coalesce(gene_biotype.x, gene_biotype.y)) %>%
dplyr::select(-gene_biotype.x, -gene_biotype.y)
labels_source$n_matches[4] = sum(!is.na(datGenes$gene_biotype)) - sum(labels_source$n_matches)
########################################################################################
# Plot results
labels_source = labels_source %>% mutate(x = 1, percentage = round(100*n_matches/sum(n_matches),1))
p = labels_source %>% ggplot(aes(x, percentage, fill=source)) + geom_bar(position='stack', stat='identity') +
theme_minimal() + coord_flip() + theme(legend.position='bottom', axis.title.y=element_blank(),
axis.text.y=element_blank(), axis.ticks.y=element_blank())
ggplotly(p + theme(legend.position='none'))
as_ggplot(get_legend(p))
########################################################################################
# Reorder rows to match datExpr
datGenes = datGenes[match(rownames(datExpr), datGenes$ensembl_gene_id),]
rm(getinfo, mart, datGenes_biotype, datGenes_biotype_by_gene, datGenes_biotype_archive,
dups, missing_ensembl_ids, missing_genes, labels_source, p)
The neuronal function is obtained from the Gene Ontology annotations for each gene, defining each gene that contains the substring ‘neuron’ as having a neuronal function. The pipeline to process and clean each gene’s GO annotations can be found in ./../Data/inputData/genes_GO_annotations.csv
GO_annotations = read.csv('./../Data/inputData/genes_GO_annotations.csv')
GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>%
mutate('ID'=as.character(ensembl_gene_id)) %>%
dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>% mutate('Neuronal'=1)
Checking how many SFARI genes are in the dataset
df = SFARI_genes %>% dplyr::select(-gene_biotype) %>% inner_join(datGenes, by=c('ID'='ensembl_gene_id'))
n_SFARI = df[['gene-symbol']] %>% unique %>% length
Considering all genes, this dataset contains 911 of the 912 SFARI genes
1.- Remove rows that don’t correspond to genes
to_keep = !is.na(datGenes$length)
Names of the rows removed: __no_feature, __ambiguous, __too_low_aQual, __not_aligned, __alignment_not_unique
datGenes = datGenes[to_keep,]
datExpr = datExpr[to_keep,]
rownames(datGenes) = datGenes$ensembl_gene_id
Removed 5 ‘genes’, 63677 remaining
2. Keep only protein coding genes
35% of the genes are protein coding genes
datGenes$gene_biotype %>% table %>% sort(decreasing=TRUE) %>% kable(caption='Biotypes of genes in dataset') %>%
kable_styling(full_width = F)
| . | Freq |
|---|---|
| protein_coding | 22543 |
| lncRNA | 12167 |
| processed_pseudogene | 10117 |
| unprocessed_pseudogene | 2547 |
| 1 | 2314 |
| miRNA | 2276 |
| misc_RNA | 2178 |
| snRNA | 2043 |
| pseudogene | 1410 |
| snoRNA | 1202 |
| lincRNA | 840 |
| transcribed_unprocessed_pseudogene | 682 |
| rRNA_pseudogene | 500 |
| transcribed_processed_pseudogene | 441 |
| antisense | 380 |
| 3 | 331 |
| 6 | 314 |
| IG_V_pseudogene | 254 |
| IG_V_gene | 179 |
| TR_V_gene | 146 |
| transcribed_unitary_pseudogene | 86 |
| TR_J_gene | 81 |
| unitary_pseudogene | 74 |
| processed_transcript | 72 |
| sense_intronic | 72 |
| IG_D_gene | 64 |
| rRNA | 49 |
| TR_V_pseudogene | 46 |
| sense_overlapping | 38 |
| scaRNA | 31 |
| polymorphic_pseudogene | 28 |
| 7 | 25 |
| IG_J_gene | 24 |
| IG_C_gene | 23 |
| Mt_tRNA | 22 |
| 4 | 17 |
| IG_C_pseudogene | 11 |
| TEC | 11 |
| TR_C_gene | 8 |
| 3prime_overlapping_ncrna | 6 |
| IG_J_pseudogene | 6 |
| ribozyme | 5 |
| TR_J_pseudogene | 5 |
| TR_D_gene | 3 |
| Mt_rRNA | 2 |
| 8 | 1 |
| translated_processed_pseudogene | 1 |
| translated_unprocessed_pseudogene | 1 |
| vaultRNA | 1 |
Most of the non-protein coding genes have very low levels of expression
plot_data = data.frame('ID' = rownames(datExpr), 'MeanExpr' = apply(datExpr, 1, mean),
'ProteinCoding' = datGenes$gene_biotype=='protein_coding')
ggplotly(plot_data %>% ggplot(aes(log2(MeanExpr+1), fill=ProteinCoding, color=ProteinCoding)) +
geom_density(alpha=0.5) + theme_minimal())
rm(plot_data)
df = SFARI_genes %>% dplyr::select(-gene_biotype) %>% inner_join(datGenes, by=c('ID'='ensembl_gene_id'))
Filtering protein coding genes, we are left with 907 SFARI Genes (we lose 4 genes)
Note: The gene name for Ensembl ID ENSG00000187951 is wrong, it should be AC091057.1 instead of ARHGAP11B, but the biotype is right, so it would still be filtered out
n_SFARI = df[['gene-symbol']][df$gene_biotype=='protein_coding'] %>% unique %>% length
df %>% filter(!`gene-symbol` %in% df$`gene-symbol`[df$gene_biotype=='protein_coding']) %>%
dplyr::select(ID, `gene-symbol`, `gene-score`, gene_biotype, syndromic, `number-of-reports`) %>%
kable(caption='Lost Genes') %>% kable_styling(full_width = F)
| ID | gene-symbol | gene-score | gene_biotype | syndromic | number-of-reports |
|---|---|---|---|---|---|
| ENSG00000187951 | ARHGAP11B | 3 | lncRNA | 0 | 2 |
| ENSG00000251593 | MSNP1AS | 2 | processed_pseudogene | 0 | 12 |
| ENSG00000233067 | PTCHD1-AS | 2 | lncRNA | 0 | 3 |
| ENSG00000197558 | SSPO | 3 | transcribed_unitary_pseudogene | 0 | 4 |
rm(df)
if(!all(rownames(datExpr)==rownames(datGenes))) cat('!!! gene rownames do not match!!!')
to_keep = datGenes$gene_biotype=='protein_coding'
datExpr = datExpr %>% filter(to_keep)
datGenes = datGenes %>% filter(to_keep)
rownames(datExpr) = datGenes$ensembl_gene_id
rownames(datGenes) = datGenes$ensembl_gene_id
Removed 41134 genes. 22543 remaining
3. Remove genes with low levels of expression
\(\qquad\) 3.1 Remove genes with zero expression in all of the samples
to_keep = rowSums(datExpr) > 0
df = data.frame('rowSums' = rowSums(datExpr), 'ensembl_gene_id' = rownames(datExpr)) %>%
right_join(SFARI_genes, by='ensembl_gene_id') %>% filter(rowSums == 0 & !is.na(`gene-score`)) %>%
arrange(`gene-score`) %>% dplyr::select(-ensembl_gene_id) %>%
filter(!duplicated(`gene-symbol`), !`gene-symbol` %in% datGenes$hgnc_symbol[to_keep])
datExpr = datExpr[to_keep,]
datGenes = datGenes[to_keep,]
\(\qquad\) Removed 3368 genes. 19175 remaining
\(\qquad\) 904 SFARI genes remaining (we lost 3 genes)
n_SFARI = SFARI_genes[['gene-symbol']][SFARI_genes$ID %in% rownames(datExpr)] %>% unique %>% length
df %>% dplyr::select(ID, `gene-symbol`, `gene-score`, gene_biotype, syndromic, `number-of-reports`) %>%
kable(caption='Lost Genes with Top Scores') %>% kable_styling(full_width = F)
| ID | gene-symbol | gene-score | gene_biotype | syndromic | number-of-reports |
|---|---|---|---|---|---|
| ENSG00000235718 | MFRP | 2 | protein_coding | 0 | 6 |
| ENSG00000186393 | KRT26 | 3 | protein_coding | 0 | 2 |
| ENSG00000221888 | OR1C1 | 3 | protein_coding | 0 | 3 |
rm(df)
\(\qquad\) 3.2 Removing genes with a high percentage of zeros
\(\qquad\) Choosing the threshold:
\(\qquad\) Criteria for selecting the percentage of zeros threshold: The minimum value in which the preprocessed data is relatively homoscedastic (we’re trying to get rid of the group of genes with very low mean and SD that make the cloud of points look like a comic book speech bubble)
\(\qquad\) - On the plot I’m using the “dual” of the maximum percentage of zeros, the minimum percentage of non zeros so the visualisation is more intuitive
\(\qquad\) - 75% seems to be a good threshold for the minimum percentage of non zeros, so 25% will be the maximum percentage of zeros allowed in a row
\(\qquad\) - The Mean vs SD plot doesn’t show all of the genes, a random sample was selected for the genes with higher level of expression so the visualisation wouldn’t be as heavy (and since we care about the genes with the lowest levels of expression, we aren’t losing important information)
\(\qquad\) - I’m only selecting a sample of the genes with higher levels of expression (>7) so the plot is not that heavy
datMeta_original = datMeta
datExpr_original = datExpr
datGenes_original = datGenes
thresholds = round(100*(88-c(seq(72,7,-5),5,3,2,0))/88)
for(threshold in thresholds){
datMeta = datMeta_original
datExpr = datExpr_original
datGenes = datGenes_original
to_keep = apply(datExpr, 1, function(x) 100*mean(x>0)) >= threshold
datGenes = datGenes[to_keep,]
datExpr = datExpr[to_keep,]
# Filter outlier samples
absadj = datExpr %>% bicor %>% abs
netsummary = fundamentalNetworkConcepts(absadj)
ku = netsummary$Connectivity
z.ku = (ku-mean(ku))/sqrt(var(ku))
to_keep = z.ku > -2
datMeta = datMeta[to_keep,]
datExpr = datExpr[,to_keep]
rm(absadj, netsummary, ku, z.ku, to_keep)
# Create a DeseqDataSet object, estimate the library size correction and save the normalized counts matrix
counts = datExpr %>% as.matrix
rowRanges = GRanges(datGenes$chromosome_name,
IRanges(datGenes$start_position, width=datGenes$length),
strand=datGenes$strand,
feature_id=datGenes$ensembl_gene_id)
se = SummarizedExperiment(assays=SimpleList(counts=counts), rowRanges=rowRanges, colData=datMeta)
dds = DESeqDataSet(se, design =~Diagnosis)
# Perform vst
vsd = vst(dds)
datExpr_vst = assay(vsd)
datMeta_vst = colData(vsd)
datGenes_vst = rowRanges(vsd)
rm(counts, rowRanges, se, vsd)
# Save summary results in dataframe
if(threshold == thresholds[1]){
mean_vs_sd_data = data.frame('threshold'=threshold, 'ID'=rownames(datExpr_vst),
'Mean'=rowMeans(datExpr_vst), 'SD'=apply(datExpr_vst,1,sd))
} else {
new_entries = data.frame('threshold'=threshold, 'ID'=rownames(datExpr_vst),
'Mean'=rowMeans(datExpr_vst), 'SD'=apply(datExpr_vst,1,sd))
mean_vs_sd_data = rbind(mean_vs_sd_data, new_entries)
}
}
# Visualise the effects of different thresholds
to_keep_1 = mean_vs_sd_data$ID[mean_vs_sd_data$threshold==thresholds[1] & mean_vs_sd_data$Mean<7] %>%
as.character
to_keep_2 = mean_vs_sd_data$ID[mean_vs_sd_data$threshold==thresholds[1] & mean_vs_sd_data$Mean>=7]
to_keep_2 = to_keep_2 %>% sample(round(length(to_keep_2)/10)) %>% as.character
plot_data = mean_vs_sd_data[mean_vs_sd_data$ID %in% c(to_keep_1, to_keep_2),]
ggplotly(plot_data %>% ggplot(aes(Mean, SD)) +
geom_point(color='#0099cc', alpha=0.2, aes(id=ID, frame=threshold)) +
scale_x_log10() + scale_y_log10() + theme_minimal())